Why Python?
Data Source: Kaggle
The mass movement of uprooted people is a highly charged geopolitical issue. This data, gathered by the UN High Commissioner for Refugees (UNHCR), covers movement of displaced persons (asylum seekers, refugees, internally displaced persons (IDP), stateless). Also included are destination country responses to asylum petitions.
This dataset includes 6 csv files covering:
Description: (1999-2016) Monthly totals about asylum applications opened in 38 European and 6 non-European countries, by month and origin. Repeat/reopened/appealed applications are largely excluded.
# NumPy is a library for numeric analysis.
import numpy as np
# Pandas is used for data analysis and data frame manipulation.
import pandas as pd
# NetworkX is a library for network analysis.
import networkx as nx
# Matplotlib is used for visualization.
import matplotlib.pyplot as plt
# Seaborn is another visualization library built on top of Matplotlib.
import seaborn as sns
sns.set()
# Cufflinks binds Plotly to pandas DataFrames, enabling interactive `.iplot()` charts.
import cufflinks as cf
# NOTE(review): `plotly.plotly` was removed in plotly >= 4 (moved to the separate
# `chart_studio` package) — confirm the installed plotly version, and note that
# `py` is never used below.
import plotly.plotly as py
#init_notebook_mode(connected=True)
cf.go_offline()
import warnings
# Suppress library warnings to keep the notebook output clean.
warnings.filterwarnings('ignore')
# Render Matplotlib figures inline in the notebook (IPython magic).
%matplotlib inline
# Load the monthly asylum-applications CSV; only the relative path is needed.
raw_data_df = pd.read_csv('../data/refugee-data/asylum_seekers_monthly.csv')
# Peek at the last 20 rows (bare expression — displayed by the notebook).
raw_data_df.tail(20)
# (rows, columns) of the data set.
raw_data_df.shape
# Clean a working copy so the raw data stays untouched:
# '*' is a placeholder for missing counts, and the long destination-country
# column name is shortened to 'Country'.
data_df = (
    raw_data_df
    .copy()
    .replace(to_replace={'*': np.nan})
    .rename(columns={'Country / territory of asylum/residence': 'Country'})
)
# Convert Year to string so it can be joined with the month name.
data_df['Year'] = data_df['Year'].astype('str')
# Build a datetime column from "<Year>-<full month name>".
data_df['Date'] = data_df[['Year', 'Month']].apply(lambda row: '-'.join(row), axis=1)
data_df['Date'] = pd.to_datetime(data_df['Date'], format='%Y-%B')
# Value arrives as text; convert it to a numeric type.
data_df['Value'] = pd.to_numeric(data_df['Value'], downcast='integer')
data_df.head()
data_df['Country'].unique()
data_df['Origin'].unique()
Let us plot the number of applications received per country.
# Total applications per destination country, as a horizontal bar chart.
fig, ax = plt.subplots(figsize=(15, 15))
per_country = (
    data_df
    .groupby('Country', as_index=False)
    .agg({'Value': np.sum})
    .sort_values('Value', ascending=True)
)
per_country.plot(kind='barh', x='Country', y='Value', color='blue', ax=ax)
ax.set(title='Target Countries Asylum Applications (1999-2016)');
Similarly, we plot the top applicant countries.
# The 50 origin countries with the most applications filed, as a bar chart.
fig, ax = plt.subplots(figsize=(15, 15))
top_origins = (
    data_df
    .groupby('Origin', as_index=False)
    .agg({'Value': np.sum})
    .sort_values('Value', ascending=True)
    .tail(50)
)
top_origins.plot(kind='barh', x='Origin', y='Value', color='red', ax=ax)
ax.set(title='Origin Countries Asylum Applications (1999-2016)');
Let us focus on Country=Germany.
# Monthly application totals for Germany, as an interactive line chart.
germany_monthly = (
    data_df
    .query('Country == "Germany"')
    .groupby('Date')
    .agg({'Value': np.sum})
)
germany_monthly.iplot(
    title='Number of Applications to Germany',
    xTitle='Date',
    color='blue'
)
We can also get the country ranking with respect to received applications.
# Rank destination countries by total applications and keep the top three names.
country_ranking = (
    data_df
    .groupby('Country', as_index=False)
    .agg({'Value': np.sum})
    .sort_values('Value', ascending=False)
)
top_in_countries = country_ranking.head(3)['Country'].values
# One time series per top country; unstack turns the countries into columns.
top_series = (
    data_df
    .query('Country in @top_in_countries')
    .groupby(['Country', 'Date'])
    .agg({'Value': np.sum})
    .unstack(0)['Value']
)
top_series.iplot(
    title='Number of Applications to Top 3 Countries',
    xTitle='Date'
)
Let us see which are the top countries applying to Germany:
# Top origin countries applying to Germany.
# NOTE(review): despite the name, this table is NOT filtered to Germany — it
# ranks (Origin, Country) pairs over all destinations; the top rows just happen
# to be dominated by Germany. Consider adding `.query('Country == "Germany"')`.
germany_df = (
    data_df
    .groupby(['Origin', 'Country'], as_index=False)
    .agg({'Value': np.sum})
    .sort_values('Value', ascending=False)
)
germany_df.head()
We see that Syrian Arab Rep. is the top one. Let us see the applications just for Syrian Arab Rep. as a time series:
# Applications to Germany originating from Syria, restricted to 2010 onwards.
syria_to_germany = (
    data_df
    .query('Country == "Germany"')
    .query('Origin == "Syrian Arab Rep."')
    .query('Date > "2009-12-31"')
    .groupby('Date')
    .agg({'Value': np.sum})
)
syria_to_germany.iplot(
    title='Number of Applications to Germany from Syrian Arab Rep.',
    xTitle='Date',
    color='blue'
)
We see a high increase in the period 2015–2016.
Question: Could we have predicted this?
Let us see if we can find any indicator using Google Trends.
# This library allow us to connect to Google Trends directly.
# pytrends is an unofficial client for Google Trends.
from pytrends.request import TrendReq

pytrends = TrendReq(tz=360)
# Search-interest index for the Arabic word for "Germany", queried from Syria.
search_terms = ['ألمانيا']
pytrends.build_payload(
    kw_list=search_terms,
    cat=0,
    timeframe='2010-01-01 2018-01-01',
    geo='SY',
    gprop=''
)
syria_germany_search_df = pytrends.interest_over_time().reset_index()
syria_germany_search_df.head()
# Data processing:
# - Google Trends reports values below 1 as the string '<1'; map them to 0.
# - Parse the date column and coerce the search-index column to integer.
# FIX: the original ended the `.assign(...)` line with a trailing backslash,
# which glued the following `syria_germany_search_df.head()` statement onto the
# expression and made the cell a SyntaxError.
syria_germany_search_df = (
    syria_germany_search_df
    .replace(to_replace={'<1': 0})
    .assign(date=lambda x: pd.to_datetime(x['date'], format='%Y-%m-%d'))
    .assign(ألمانيا=lambda x: pd.to_numeric(x['ألمانيا'], downcast='integer'))
)
syria_germany_search_df.head()
# Plot the search index, truncated before mid-2017.
trimmed_search_df = syria_germany_search_df.query('date < "2017-06-01"')
trimmed_search_df.iplot(
    x='date',
    y='ألمانيا',
    title='Search Data for the Keyword : Germany (ألمانيا) in Syria',
    color='red'
)
We see a remarkable peak in search in the summer of 2014.
Let us represent the top applications to Germany as a weighted network object:
Let us work on the Germany network.
# Total flow per (Origin -> Country) pair across the whole period.
network_df = (
    data_df
    .groupby(['Origin', 'Country'], as_index=False)
    .agg({'Value': np.sum})
    .sort_values('Value', ascending=False)
)
# Keep only the 50 heaviest edges into Germany.
plot_network_df = network_df.query('Country == "Germany"').head(50)
Let us encode the data as a network:
# Directed graph: each row becomes an Origin -> Country edge weighted by Value.
G = nx.from_pandas_edgelist(
    plot_network_df,
    source='Origin',
    target='Country',
    edge_attr='Value',
    create_using=nx.DiGraph(),
)
Now we plot the network (and save it as pdf).
# Draw the Germany network and save it as a PDF.
# Node size is proportional to the weighted degree (scaled down for display).
fig, ax = plt.subplots(figsize=(35, 35))
node_sizes = [weight / 50 for _, weight in G.degree(weight='Value')]
layout = nx.spring_layout(G, iterations=500)
nx.draw(
    G,
    pos=layout,
    with_labels=True,
    arrows=True,
    node_color='#a2cffe',
    width=0.7,
    edge_color='.4',
    font_size=15,
    font_color='black',
    node_size=node_sizes,
    ax=ax
)
plt.savefig('../plots/de_network_plot.pdf')
We can also generate a more complex network:
# Rebuild the edge list, this time keeping the 300 heaviest flows globally.
network_df = (
    data_df
    .groupby(['Origin', 'Country'], as_index=False)
    .agg({'Value': np.sum})
    .sort_values('Value', ascending=False)
)
plot_network_df = network_df.head(300)
# Encode it as a directed graph with Value as the edge weight.
G = nx.from_pandas_edgelist(
    plot_network_df,
    source='Origin',
    target='Country',
    edge_attr='Value',
    create_using=nx.DiGraph(),
)
# Draw the full (top-300-edges) network and save it as a PDF.
# Node size is proportional to the weighted degree (scaled down for display).
fig, ax = plt.subplots(figsize=(100, 100))
node_sizes = [weight / 600 for _, weight in G.degree(weight='Value')]
layout = nx.spring_layout(G, iterations=500)
nx.draw(
    G,
    pos=layout,
    with_labels=True,
    arrows=True,
    node_color='#a2cffe',
    width=0.7,
    edge_color='.4',
    font_size=15,
    font_color='black',
    node_size=node_sizes,
    ax=ax
)
plt.savefig('../plots/all_network_plot.pdf')